Loading libraries

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)
library(countrycode)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Set MAPBOX_TOKEN so plotly does not raise the Mapbox token error.
# The value is given as a string: environment variables are text, and a
# numeric literal only works as long as R converts it without an exponent.
Sys.setenv(MAPBOX_TOKEN = "11122223333444")

# Load the main COVID dataset used by the first part of the analysis.
data <- read.csv("owid-covid-data.csv")

#head(data)

Data preprocessing: checking the column names to determine the important ones that will help me achieve the objectives, and checking for null/NaN values.

Write the column names to a text file for my reference in the analysis

# Dump the column names of `data`, one per line, into columns.txt.
columns <- names(data)
file_columns <- file("columns.txt", open = "wt")
writeLines(columns, con = file_columns)
close(file_columns) # release the connection once the file is written

Check columns with the null values

#which(is.na(data))
#cols_with_na <- which(apply(data, 2, function(x) any(is.na(x))))
#colnames(data)[cols_with_na] # this indicates that every feature in the dataset has at least one missing record in one or more rows, except iso_code, continent, location and date
#colSums(is.na(data))

Visualizing the total reported cases since the start of the pandemic

df <- data
# Line chart of total_cases against date. The rows are pre-sorted by
# total_cases before being handed to plotly.
# NOTE(review): no grouping or split is specified, so every row of the
# dataset ends up in the same line trace -- confirm this is intended.
p <- plot_ly(arrange(df, total_cases), x = ~date, y = ~total_cases)
p %>% add_lines()

Considering the new tests

# Line chart of new_tests against date.
# Build the base plot first and add the line trace in a separate call.
# Piping the plot into add_lines(p) would pass the previously stored plot
# object `p` as the `x` argument, and the figure would never be displayed.
p <- df %>%
  arrange(new_tests) %>%
  plot_ly(x = ~date, y = ~new_tests)
add_lines(p)

A notable issue revealed by these visualizations is that, although millions of cases were reported, the number of new tests was relatively low. This implies that not all countries reported their new tests, even though some of them had the highest numbers of reported cases.

To check the validity of this conclusion, the following visualizations dig deeper by highlighting the top countries with the most cases and the most new tests.

# Split each record's date into its year and month components.
dates <- c(df[["date"]])
years <- year(ymd(dates))
months <- month(ymd(dates))

# Compact frame holding only what the continent-level summary needs.
new_df <- data.frame(
  Month = months,
  Year = years,
  Continent = df[["continent"]],
  total_cases = df[["total_cases"]]
)

# Sum total_cases per continent and year, largest totals first.
# NOTE(review): summarise() leaves the result grouped by Continent, so
# top_n(10) keeps up to 10 rows within each continent rather than 10 rows
# overall -- confirm which was intended.
# NOTE(review): if total_cases is a running (cumulative) count, summing it
# across rows overstates the totals -- verify against the data dictionary.
top10 <- new_df %>%
  filter(!is.na(Continent)) %>%
  group_by(Continent, Year) %>%
  summarise(totals = sum(total_cases, na.rm = TRUE)) %>%
  arrange(-totals) %>%
  top_n(10)
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.
## Selecting by totals
# Bar chart of the summed totals for each continent.
plot_ly(top10, x = ~Continent, y = ~totals, type = "bar")
# Switch to a second dataset that carries fixed latitude/longitude columns,
# which is better suited to map visualization, and keep a copy as .RData.
new_data <- read.csv(file = "coronavirus.csv")
save(new_data, file = "corona.RData")
head(new_data, n = 6L)
##         date province country     lat      long      type cases   uid iso2 iso3
## 1 2020-01-22  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 2 2020-01-23  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 3 2020-01-24  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 4 2020-01-25  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 5 2020-01-26  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 6 2020-01-27  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
##   code3    combined_key population continent_name continent_code
## 1   124 Alberta, Canada    4413146  North America           <NA>
## 2   124 Alberta, Canada    4413146  North America           <NA>
## 3   124 Alberta, Canada    4413146  North America           <NA>
## 4   124 Alberta, Canada    4413146  North America           <NA>
## 5   124 Alberta, Canada    4413146  North America           <NA>
## 6   124 Alberta, Canada    4413146  North America           <NA>
# Leading countries by number of confirmed cases: total the confirmed rows
# per country, sort from largest to smallest, and keep the top 10.
filtered_df <- new_data %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(sum_Cases = sum(cases)) %>%
  arrange(desc(sum_Cases)) %>%
  top_n(10) # no `wt` given, so the last column (sum_Cases) is used for ranking
## Selecting by sum_Cases
# Static ggplot bar chart of the leading countries, then converted to an
# interactive plotly figure.
plt <- ggplot(data = filtered_df, mapping = aes(x = country, y = sum_Cases)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal()
plt %>% ggplotly()
library(shiny)
# Prepare the data for the choropleth: case totals per country and month.
dates <- c(new_data$date)
Date <- ymd(dates)

#new_data$code3 <- countrycode(new_data$country, "country.name", "iso3c") #convert the names into 3 letter country codes

cases_summed <- new_data %>%
  mutate(
    Year = year(Date),
    Month = month(Date),
    # Abbreviated month name plus year (e.g. "Jan 2020" in an English
    # locale), formatted straight from the parsed date. A date that failed
    # to parse stays NA instead of becoming the literal text "NA".
    YearMonth = format(Date, "%b %Y")
  ) %>%
  group_by(country, YearMonth, iso3) %>%
  # Keep strictly positive counts only; this drops the rows recorded with
  # negative values and also the rows with zero cases.
  filter(cases > 0) %>%
  summarise(cases_sum = sum(cases))
## `summarise()` has grouped output by 'country', 'YearMonth'. You can override
## using the `.groups` argument.
#cases seem to have been recorded with negative values

# Smallest and largest monthly total, printed for reference.
min_value <- min(cases_summed[["cases_sum"]])
max_value <- max(cases_summed[["cases_sum"]])

cat("Minimum value in cases_sum column:", min_value, "\n", sep = " ")
## Minimum value in cases_sum column: 1
cat("Maximum value in cases_sum column:", max_value, "\n", sep = " ")
## Maximum value in cases_sum column: 20417765
# Assign a colour name to each country.
#
# Args:
#   country: vector of country names (character or factor; may be empty).
#
# Returns:
#   A character vector the same length as `country`. Countries listed in one
#   of the groups below get that group's colour; every other value, including
#   NA, gets "grey". Empty input gives character(0).
#
# NOTE(review): the listed names ("country1" ... "country15") look like
# placeholders -- fill in real country names, otherwise everything is "grey".
assign_colors <- function(country) {
  color_groups <- list(
    red    = c("country1", "country2", "country3"),
    blue   = c("country4", "country5", "country6"),
    green  = c("country7", "country8", "country9"),
    yellow = c("country10", "country11", "country12"),
    purple = c("country13", "country14", "country15")
  )

  # Flatten the groups into a name -> colour lookup table.
  lookup <- setNames(
    rep(names(color_groups), lengths(color_groups)),
    unlist(color_groups, use.names = FALSE)
  )

  # A lookup (instead of nested ifelse) always yields a character vector,
  # even when `country` has length zero.
  country_colors <- unname(lookup[as.character(country)])
  country_colors[is.na(country_colors)] <- "grey"
  country_colors
}

# Shiny UI: a sidebar drop-down for picking the month (or "All") next to a
# main panel that hosts the plotly choropleth.
ui <- fluidPage(
  titlePanel(title = "Cases by Country"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      selectInput(
        inputId = "month",
        label = "Month:",
        choices = c("All", unique(cases_summed$YearMonth))
      )
    ),
    mainPanel = mainPanel(
      plotlyOutput(outputId = "choropleth")
    )
  )
)

# Shiny server: narrows the monthly totals to the selected month and renders
# them as a choropleth map.
server <- function(input, output) {

  # Rows matching the chosen month label; the "All" entry keeps every row.
  filtered_data <- reactive({
    chosen <- input$month
    if (chosen != "All") {
      filter(cases_summed, YearMonth == chosen)
    } else {
      cases_summed
    }
  })

  # Choropleth keyed by ISO-3 country code, built step by step.
  output$choropleth <- renderPlotly({
    map_data <- mutate(filtered_data(), color = assign_colors(country))
    fig <- plot_ly(
      map_data,
      z = ~cases_sum,
      text = ~country,
      locations = ~iso3,
      locationmode = "ISO-3",
      type = "choropleth",
      color = ~color
    )
    fig <- colorbar(fig, title = "Total Covid-reported cases ")
    layout(
      fig,
      title = "Cases by Country",
      geo = list(showframe = FALSE, showcoastlines = FALSE)
    )
  })

}

# Launch the Shiny application from the `ui` and `server` objects.
shinyApp(ui, server)
## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.
Shiny applications not supported in static R Markdown documents